package com.kdtech.analyse.Bbs;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.DoMainUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.NumberUtils;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.RegexUtils;

public class CzbtvBbsAnalyse implements AnalyseNews {

	
	public boolean isDetailPage(String url) {
		String[] regex = {
				"http://bbs.czbtv.com/dispbbs.asp[?]boardid=[0-9]*&ID=[0-9]*&replyID=[0-9]*",
				"http://bbs.czbtv.com/dispbbs.asp[?]boardid=[0-9]*&ID=[0-9]*",
				"http://bbs.czbtv.com/dispbbs.asp[?]boardid=[0-9]*&id=[0-9]*",
				"http://bbs.czbtv.com/dispbbs.asp[?]boardid=[0-9]*&Id=[0-9]*",
				"http://bbs.czbtv.com/dispbbs.asp[?]boardid=[0-9]*&page=1&ID=[0-9]*",
				"http://bbs.czbtv.com/dispbbs.asp[?]BoardID=[0-9]*&id=[0-9]*",
				"http://bbs.czbtv.com/dispbbs.asp[?]boardID=[0-9]*&page=1&ID=[0-9]*",
				"http://bbs.czbtv.com/dispbbs.asp\\?boardid=[0-9]*&amp.*",
				"http://www.czbtv.com/[a-z/]+t[0-9_]*.htm",
		};
		return RegexUtils.matchAny(url, regex);
	}

	
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (!isDetailPage(urlMeta.getUrl())) {
		}
		NewsMeta bbs = new NewsMeta();
		if (urlMeta.getHtml() == null) {
		}
		String htmltxt = urlMeta.getHtml();
		String url = urlMeta.getUrl();

		bbs.setUrl(url);
		String title = null;
		String content = null;
		Long date = null;
		String clickNum = null;

		Document doc = Jsoup.parse(htmltxt);
		
		if(doc.select("td.inforight b").size()>0)
		{
			title = doc.select("td.inforight b").first().text().replace(" ", "");
		}
		if(StringUtils.isBlank(title))
		{
			title = doc.select("title").text();
		}
		date = DateUtils.matchDate(doc.select("span.font10").text());
		if(date==null)
		{
			date = DateUtils.matchDate(url);
		}

		/**
		 * 初始化、截取 、设置作者信息*******************************************
		 *
		 * */
		String author = null;
		if(doc.select("span.username font").size()>0)
		{
			author = doc.select("span.username font").get(0).text();
		}
		bbs.setAuthor(author);
		/**
		 *                        	 **************************end *************************
		 *
		 **/
		content = HtmlCleaner.getContentHtml(url,  doc.select("div#textstyle_1"));
		if(StringUtils.isBlank(content))
		{
			content = HtmlCleaner.getContentHtml(url,  doc.select("html body table tbody tr td table tbody tr td table tbody tr td font"));
		}
		clickNum = doc.select("b[style=color:#ff6600]").text();
		bbs.setTitle(title);
		bbs.setContent(content);
		bbs.setDate(date);
		/**
		 * 解析用于更新的地址
		 */
		bbs.setUpdateUrl(url);
		bbs.setClickNum(NumberUtils.parseInt(clickNum));
		return bbs;
	}

	
	public NewsMeta Update(NewsMeta meta) {
		if(meta!=null){
			String updateUrl = meta.getUpdateUrl();
			if (StringUtils.isNotBlank(updateUrl)){
				UrlMeta urlMeta = CrawlHTML.responseToURL(updateUrl);
				return parserHtml(urlMeta);
			}
			return null;
		}
		return null;
	}

	public static void main(String[] args) {
		CzbtvBbsAnalyse a = new CzbtvBbsAnalyse();

		String url = "http://www.czbtv.com/czc/ccwh/t20110715_74888.htm";
		System.out.println(DoMainUtils.GetDomainName(url));
		if (!a.isDetailPage(url)) {
			System.out.println("不匹配规则");
		} else {
			UrlMeta meta = CrawlHTML.responseToURL(url);
			NewsMeta parserHtml = a.parserHtml(meta);
			System.out.println(parserHtml);
//			System.out.println(a.Update(parserHtml));
		}

	}
	
}
